# One-time setup. Install `webshot` only when it is missing, so that re-running
# the script does not reinstall the package on every execution.
if (!requireNamespace("webshot", quietly = TRUE)) {
  install.packages("webshot")
}
# PhantomJS is the headless browser `webshot` uses to capture widgets.
# NOTE(review): recent webshot releases appear to skip the download when a
# suitable PhantomJS is already present - confirm for the installed version.
webshot::install_phantomjs()
pacman::p_load(dplyr, ggplot2, tm, SnowballC, wordcloud2, RColorBrewer, 
               plotly, stringr, d3heatmap, htmlwidgets,readr, maps, Matrix)


【A】News Article Corpus Summary

# Restore the saved R objects from data/X.rdata; the article table `X` used
# throughout the rest of the script is expected to come from this file.
load("data/X.rdata")
# Per-column overview of the article table.
summary(X)
     url                                  sub            date           
 Length:10760       Business & Finance      :2249   Min.   :2010-02-17  
 Class :character   R&D                     :1857   1st Qu.:2013-06-03  
 Mode  :character   Grid Connection         :1319   Median :2015-03-23  
                    Authorities             :1131   Mean   :2015-03-05  
                    Technology              :1077   3rd Qu.:2017-03-02  
                    Operations & Maintenance: 947   Max.   :2019-04-12  
                    (Other)                 :2180                       
    title             abstract            author              tags          
 Length:10760       Length:10760       Length:10760       Length:10760      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
     text                rov        
 Length:10760       Min.   : 0.000  
 Class :character   1st Qu.: 0.000  
 Mode  :character   Median : 0.000  
                    Mean   : 0.033  
                    3rd Qu.: 0.000  
                    Max.   :13.000  
                                    
# Articles per year (base graphics). The larger bottom margin leaves room for
# the rotated year labels.
par(cex = 0.8, mar = c(6, 4, 4, 2))
hist(X$date, breaks = "year", freq = TRUE,
     main = "No. Articles per Year", las = 2, xlab = "")

# Articles per subject, sorted ascending so the most frequent subject ends up
# at the top of the horizontal bar chart. The wide left margin fits the labels.
par(cex = 0.8, mar = c(4, 12, 4, 2))
table(X$sub) %>%
  sort %>%
  barplot(las = 2, horiz = TRUE, main = "No. Articles per Subject", xlab = "freq")

# Share of each subject per year: count articles by (year, subject) and draw
# the counts as bars normalised to 1 within each year (position = "fill").
p <- X %>%
  mutate(year = as.integer(format(date, "%Y"))) %>%
  count(year, sub) %>%
  ggplot(aes(x = year, y = n, fill = sub)) +
  geom_col(position = "fill") +
  scale_x_continuous(breaks = 2009:2019)
ggplotly(p)


【B】Wordcloud by Subject

# Words dropped before building the word clouds: the English stop-word list
# returned by stopwords("en") plus a handful of extra terms listed here.
stops <- c(
  stopwords("en"),
  "offshore", "wind", "energy",
  "will", "said", "also", "can"
)

#' Render a word cloud for one article subject and save it as an image
#'
#' Filters the global article table `X` to the rows whose `sub` equals
#' `subject`, cleans the article text with `tm`, counts term frequencies,
#' draws the frequent terms with `wordcloud2()` and captures the widget to
#' `output` with `webshot::webshot()`.
#'
#' Side effect: the widget is written to "temp.html" in the working directory
#' (non-self-contained, so its support files are written alongside it) and is
#' not removed afterwards.
#'
#' @param subject Subject label compared against `X$sub`.
#' @param output Path of the image file produced by `webshot::webshot()`.
#' @param ROV If `TRUE`, keep only articles whose text contains "ROV" or
#'   "ROUV".
#' @param min.freq Only words with a frequency strictly greater than this value
#'   are drawn.
#' @param xstop Extra stop words removed in addition to the global `stops`.
#' @param ... Further arguments forwarded to `wordcloud2()`.
#' @return A data frame with columns `word` and `freq` for every term (not only
#'   the plotted ones), sorted by decreasing frequency.
WC <- function(subject, output, ROV = FALSE, min.freq = 25, xstop = c(), ...) {
  X1 <- X %>% filter(sub == subject)
  if (ROV) {
    X1 <- subset(X1, str_count(text, "ROV|ROUV") > 0)
  }
  # Fail early with a clear message instead of an obscure downstream error
  # when nothing is left to build a corpus from.
  if (nrow(X1) == 0) {
    stop("No articles left for subject '", subject, "'", call. = FALSE)
  }

  # Drop every character that cannot be represented in ASCII.
  txt <- iconv(X1$text, "latin1", "ASCII", sub = "")
  docs <- Corpus(VectorSource(txt))
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  # Stop words are removed before punctuation is stripped.
  docs <- tm_map(docs, removeWords, c(stops, xstop))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)

  # Total frequency of each term across all selected articles.
  dtm <- TermDocumentMatrix(docs)
  m <- as.matrix(dtm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  hw <- wordcloud2(subset(d, freq > min.freq), ...)
  saveWidget(hw, "temp.html", selfcontained = FALSE)
  # The delay gives the widget time to finish drawing before the screenshot.
  webshot::webshot("temp.html", output, vwidth = 800, vheight = 600, delay = 20)
  d
}


ROV: R&D
7.png

7.png



ROV: Technology
9.png

9.png



Country vs. Subject
# Country-by-subject mention counts ----

# Build one alternation pattern from the `ISOname` column of `iso3166` and pull
# every match out of each article text.
# NOTE(review): the names are used as raw regex without word boundaries, so a
# short name can match inside a longer word. Extraction ignores case, but the
# tabulation and match() below are case-sensitive, so spellings that differ only
# in case are counted as different names. Confirm this is acceptable.
pat <- paste(iso3166$ISOname, collapse = "|")
CX <- str_extract_all(X$text, regex(pat, ignore.case = TRUE))

# Names extracted more than 9 times over the whole corpus, most frequent first.
N9 <- unlist(CX) %>% table %>% sort(decreasing = TRUE) %>% {.[. > 9]} %>% names

# For every article, the positions in N9 of the names it mentions
# (NA for names that did not make it into N9).
C9 <- lapply(CX, match, N9)

# Long table of (article index i, country index j) pairs, one row per mention;
# articles without any mention contribute no rows.
df <- data.frame(
  i = rep(seq_along(C9), lengths(C9)),
  j = as.integer(unlist(C9))
)
df <- subset(df, complete.cases(df))
df$x <- 1

# Order the subject levels by decreasing article count and add the
# publication year as a character column.
z <- table(X$sub) %>% sort(decreasing = TRUE) %>% names
X$sub <- factor(X$sub, levels = z)
X$year <- format(X$date, "%Y")

# Article x country count matrix. sparseMatrix() adds up `x` for repeated
# (i, j) pairs, so a cell holds the number of mentions in that article.
# `dims` is given explicitly so the matrix always has one row per article,
# even when the last articles mention no country.
mx <- sparseMatrix(
  i = df$i, j = df$j, x = df$x,
  dims = c(nrow(X), length(N9)),
  dimnames = list(as.character(seq_len(nrow(X))), N9)
) %>%
  as.data.frame.matrix

# Sum the per-article counts within each subject: countries in rows,
# subjects in columns.
A <- vapply(split(mx, X$sub), colSums, numeric(length(N9)))
t(A[1:12, ])
                         Germany Denmark China Netherlands Taiwan France Japan
Business & Finance           362     208   248         202    231    190   185
R&D                          138     148   334          62     45     80   117
Grid Connection              296     120    30         112      8     69    22
Authorities                  133     113    64          92    105     83    50
Technology                    91      74    93          43     11     45    79
Operations & Maintenance     119      67    24          60     20     14     8
Vessels                       66      41    54          51     24     12     6
Training & Education          68      51    11          27     18      5    10
Contracts & Tenders           54      33    17          58     65      4    18
Environment                   16      12    10          14     18      1     2
Ports & Logistics             32      24     3          18     17      7     4
Jobs & Recruitment             2       6     3           2      4      0     0
Industry Contribution          6       1     4           5     19      2     5
Wind Farm Update               1       1     1           1      3      0     0
                         Norway Ireland Belgium India United States
Business & Finance          107      70      95    50            34
R&D                          64      70      28   140           102
Grid Connection              95      37      61     4             7
Authorities                  50     157      18    36            65
Technology                   34      28      21    19            14
Operations & Maintenance     32      19      23    16             7
Vessels                      32       3      25     0             6
Training & Education          6      23       0     6             7
Contracts & Tenders           8       0      14     8             3
Environment                   6       6       2    15            10
Ports & Logistics             0      13      12     0             2
Jobs & Recruitment            0       3       0     0             1
Industry Contribution         3       0       1     1             1
Wind Farm Update              0       0       3     0             0


# Interactive heatmap of subjects (rows) against the first 12 country columns.
# NOTE(review): the two positional FALSE values are presumably Rowv/Colv (no
# row/column reordering) - confirm against the installed d3heatmap version.
t(A)[, 1:12] %>%
  as.data.frame.matrix %>%
  d3heatmap(FALSE, FALSE, col = "Greens")